In [2]:
%matplotlib inline
import numpy as np
import pandas as pd

In [2]:
%load ../ud120-projects/final_project/poi_id.py

In [4]:
#%%writefile ../ud120-projects/final_project/poi_id.py
#!/usr/bin/python

import matplotlib.pyplot as plt
import sys
import pickle
sys.path.append("../ud120-projects/tools/")

from feature_format import featureFormat
from feature_format import targetFeatureSplit

### features_list is a list of strings, each of which is a feature name
### first feature must be "poi", as this will be singled out as the label
features_list = ['poi', 'salary', 'deferral_payments', 'total_payments', 'loan_advances',
                 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value',
                 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive',
                 'restricted_stock', 'director_fees', 'to_messages', 
                 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 
                 'poi', 'shared_receipt_with_poi']


### load the dictionary containing the dataset
data_dict = pickle.load(open("../ud120-projects/final_project/final_project_dataset.pkl", "r") )

### we suggest removing any outliers before proceeding further

### if you are creating any new features, you might want to do that here
### store to my_dataset for easy export below
my_dataset = data_dict



### these two lines extract the features specified in features_list
### and extract them from data_dict, returning a numpy array
data = featureFormat(my_dataset, features_list)



### if you are creating new features, could also do that here



### split into labels and features (this line assumes that the first
### feature in the array is the label, which is why "poi" must always
### be first in features_list
labels, features = targetFeatureSplit(data)



### machine learning goes here!
### please name your classifier clf for easy export below

clf = None    ### get rid of this line!  just here to keep code from crashing out-of-box


### dump your classifier, dataset and features_list so 
### anyone can run/check your results
pickle.dump(clf, open("../ud120-projects/final_project/my_classifier.pkl", "w") )
pickle.dump(data_dict, open("../ud120-projects/final_project/my_dataset.pkl", "w") )
pickle.dump(features_list, open("../ud120-projects/final_project/my_feature_list.pkl", "w") )

In [55]:
data_dict = pickle.load(open("../ud120-projects/final_project/my_dataset.pkl", "r") )

In [56]:
#[v for k,v in data_dict.items()][0]

In [57]:
data_dict.items()[0]


Out[57]:
('METTS MARK',
 {'bonus': 600000,
  'deferral_payments': 'NaN',
  'deferred_income': 'NaN',
  'director_fees': 'NaN',
  'email_address': 'mark.metts@enron.com',
  'exercised_stock_options': 'NaN',
  'expenses': 94299,
  'from_messages': 29,
  'from_poi_to_this_person': 38,
  'from_this_person_to_poi': 1,
  'loan_advances': 'NaN',
  'long_term_incentive': 'NaN',
  'other': 1740,
  'poi': False,
  'restricted_stock': 585062,
  'restricted_stock_deferred': 'NaN',
  'salary': 365788,
  'shared_receipt_with_poi': 702,
  'to_messages': 807,
  'total_payments': 1061827,
  'total_stock_value': 585062})

In [58]:
df = pd.DataFrame.from_dict(my_dataset, orient='index')

In [59]:
#%load ../ud120-projects/tools/feature_format.py

In [60]:
df.head()


Out[60]:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value ... loan_advances from_messages other from_this_person_to_poi poi director_fees deferred_income long_term_incentive email_address from_poi_to_this_person
ALLEN PHILLIP K 201955 2902 2869717 4484442 1729541 4175000 126027 1407 -126027 1729541 ... NaN 2195 152 65 False NaN -3081055 304805 phillip.allen@enron.com 47
BADUM JAMES P NaN NaN 178980 182466 257817 NaN NaN NaN NaN 257817 ... NaN NaN NaN NaN False NaN NaN NaN NaN NaN
BANNANTINE JAMES M 477 566 NaN 916197 4046157 NaN 1757552 465 -560222 5243487 ... NaN 29 864523 0 False NaN -5104 NaN james.bannantine@enron.com 39
BAXTER JOHN C 267102 NaN 1295738 5634343 6680544 1200000 3942714 NaN NaN 10623258 ... NaN NaN 2660303 NaN False NaN -1386055 1586055 NaN NaN
BAY FRANKLIN R 239671 NaN 260455 827696 NaN 400000 145796 NaN -82782 63014 ... NaN NaN 69 NaN False NaN -201641 NaN frank.bay@enron.com NaN

5 rows × 21 columns


In [61]:
df['salary'].unique()


Out[61]:
array([201955, 'NaN', 477, 267102, 239671, 80818, 231330, 213999, 216582,
       187922, 213625, 248546, 278601, 248017, 261516, 330546, 240189,
       261809, 415189, 288542, 314288, 184899, 206121, 365163, 492375,
       210500, 250100, 262788, 221003, 210692, 182245, 170941, 304588,
       440698, 199157, 1060932, 192008, 231946, 274975, 272880, 6615,
       374125, 243293, 262663, 211788, 130724, 85274, 288558, 275101,
       404338, 174246, 271442, 309946, 224305, 339288, 1072321, 273746,
       236457, 349487, 263413, 365038, 370448, 365788, 267093, 251654,
       229284, 329078, 94941, 261879, 655037, 197091, 96840, 76399, 420636,
       249201, 304110, 269076, 248146, 211844, 428780, 1111258, 239502,
       162779, 257486, 265214, 222093, 247338, 26704229, 288589, 357091,
       259996, 63744, 510364, 317543, 158403], dtype=object)

'NaN' was imported as a string instead of a a missing value. We will convert these to NaN type and look how many missing values our data has.


In [62]:
df = df.replace('NaN', np.nan)

In [63]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 146 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 21 columns):
salary                       95 non-null float64
to_messages                  86 non-null float64
deferral_payments            39 non-null float64
total_payments               125 non-null float64
exercised_stock_options      102 non-null float64
bonus                        82 non-null float64
restricted_stock             110 non-null float64
shared_receipt_with_poi      86 non-null float64
restricted_stock_deferred    18 non-null float64
total_stock_value            126 non-null float64
expenses                     95 non-null float64
loan_advances                4 non-null float64
from_messages                86 non-null float64
other                        93 non-null float64
from_this_person_to_poi      86 non-null float64
poi                          146 non-null bool
director_fees                17 non-null float64
deferred_income              49 non-null float64
long_term_incentive          66 non-null float64
email_address                111 non-null object
from_poi_to_this_person      86 non-null float64
dtypes: bool(1), float64(19), object(1)

There is a lot of missing data!


In [13]:
print "NaN - Missing values:"
len(df.index)-df.count()


NaN - Missing values:
Out[13]:
salary                        51
to_messages                   60
deferral_payments            107
total_payments                21
exercised_stock_options       44
bonus                         64
restricted_stock              36
shared_receipt_with_poi       60
restricted_stock_deferred    128
total_stock_value             20
expenses                      51
loan_advances                142
from_messages                 60
other                         53
from_this_person_to_poi       60
poi                            0
director_fees                129
deferred_income               97
long_term_incentive           80
email_address                 35
from_poi_to_this_person       60
dtype: int64

First, check for potential invalid people in the dataset by looking at names without a " ".


In [64]:
[suspect for suspect in df.index if " " not in suspect]


Out[64]:
['TOTAL']

TOTAL is an aggregate category, and not a person's name. This should be removed.


In [65]:
df = df.drop('TOTAL', axis=0)

Next, we'll look at names of people who only have 3 or less feature entries (one of which is simply True/False for poi and not a feature) out of 21 features. One happens to be a Travel Agency, and others are missing nearly all entries as well.

These are good candidates for potential removal.


In [66]:
print [ind for ind in enumerate(df.T.count()) if ind[1] <= 3]

df.irow([56, 84, 127, 137, 142])


[(56, 3), (84, 1), (127, 3), (137, 3), (142, 3)]
Out[66]:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value ... loan_advances from_messages other from_this_person_to_poi poi director_fees deferred_income long_term_incentive email_address from_poi_to_this_person
GRAMM WENDY L NaN NaN NaN 119292 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN False 119292 NaN NaN NaN NaN
LOCKHART EUGENE E NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN False NaN NaN NaN NaN NaN
THE TRAVEL AGENCY IN THE PARK NaN NaN NaN 362096 NaN NaN NaN NaN NaN NaN ... NaN NaN 362096 NaN False NaN NaN NaN NaN NaN
WHALEY DAVID A NaN NaN NaN NaN 98718 NaN NaN NaN NaN 98718 ... NaN NaN NaN NaN False NaN NaN NaN NaN NaN
WROBEL BRUCE NaN NaN NaN NaN 139130 NaN NaN NaN NaN 139130 ... NaN NaN NaN NaN False NaN NaN NaN NaN NaN

5 rows × 21 columns


In [67]:
#df.columns
#df = df.drop(['Name'], axis=1)
df = df.drop(['GRAMM WENDY L', 'THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E', 'WHALEY DAVID A', 'WROBEL BRUCE'], axis=0)

Email address is also not needed for this model as it is a unique string for each person.


In [68]:
df = df.drop(['email_address'], axis=1)

First, we must deal with the NaN's since many models don't like missing values. For a quick and dirty solution, we will just fill in 0's for missing values.

This is just to get a model up and running, and will be handled differently later.


In [19]:
# Cross-validation fills
df = df.fillna(0)

In [20]:
from sklearn.cross_validation import train_test_split

In [21]:
labels = df['poi']
features = df.drop('poi', axis=1)
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, 
                                                                            test_size=0.2,
                                                                            random_state=808)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [23]:
param_grid = [{'C':[.0001, .001, .01, 0.1, 1, 10, 100, 1000], 'gamma': [10, 1, .1, .01, .001, .0001]}]

In [24]:
from sklearn import grid_search

In [25]:
svm_model = SVC()
clf = grid_search.GridSearchCV(svm_model, param_grid, n_jobs=4, scoring='f1')

In [25]:


In [26]:
clf.fit(features_train, labels_train)


Out[26]:
GridSearchCV(cv=None,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=4,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'gamma': [10, 1, 0.1, 0.01, 0.001, 0.0001]}],
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring='f1',
       verbose=0)

In [27]:
clf.best_estimator_


Out[27]:
SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0, degree=3,
  gamma=10, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

In [28]:
clf.best_score_


Out[28]:
0.0

In [29]:
clf.scorer_


Out[29]:
make_scorer(f1_score)

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
rf = RandomForestClassifier(n_estimators=1000, n_jobs=4)

In [32]:
rf.fit(features_train, labels_train)


Out[32]:
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=1000, n_jobs=4,
            oob_score=False, random_state=None, verbose=0)

In [33]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [34]:
pred = rf.predict(features_test)
print "Accuracy:", accuracy_score(labels_test, pred), '\n'
print "Confusion Matrix:\n", confusion_matrix(labels_test, pred), '\n'
print "Classification Report:", classification_report(labels_test, pred)


Accuracy: 0.964285714286 

Confusion Matrix:
[[26  0]
 [ 1  1]] 

Classification Report:              precision    recall  f1-score   support

      False       0.96      1.00      0.98        26
       True       1.00      0.50      0.67         2

avg / total       0.97      0.96      0.96        28


In [35]:
features = np.array(features)
labels = np.array(labels)

In [36]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.cross_validation import StratifiedKFold
skf = StratifiedKFold( labels, n_folds=3 )
precisions = []
recalls = []
for train_idx, test_idx in skf: 
    features_train = []
    features_test  = []
    labels_train   = []
    labels_test    = []
    for ii in train_idx:
        features_train.append( features[ii] )
        labels_train.append( labels[ii] )
    for jj in test_idx:
        features_test.append( features[jj] )
        labels_test.append( labels[jj] )
    
    ### fit the classifier using training set, and test on test set
    rf.fit(features_train, labels_train)
    pred = rf.predict(features_test)


    ### for each fold, print some metrics
    print
    print "precision score: ", precision_score( labels_test, pred )
    print "recall score: ", recall_score( labels_test, pred )

    precisions.append( precision_score(labels_test, pred) )
    recalls.append( recall_score(labels_test, pred) )

### aggregate precision and recall over all folds
print "average precision: ", sum(precisions)/3.
print "average recall: ", sum(recalls)/3.


precision score:  0.0
recall score:  0.0

precision score:  0.0
recall score:  0.0

precision score:  1.0
recall score:  0.166666666667
average precision:  0.333333333333
average recall:  0.0555555555556

In [ ]:
#%load ../ud120-projects/final_project/tester.py

In [37]:
#!/usr/bin/pickle

""" a basic script for importing student's POI identifier,
    and checking the results that they get from it 
 
    requires that the algorithm, dataset, and features list
    be written to my_classifier.pkl, my_dataset.pkl, and
    my_feature_list.pkl, respectively

    that process should happen at the end of poi_id.py

"""

import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

### load up student's classifier, dataset, and feature_list
clf = pickle.load(open("my_classifier.pkl", "r") )
dataset = pickle.load(open("my_dataset.pkl", "r") )
feature_list = pickle.load(open("my_feature_list.pkl", "r"))

### print basic info about the algorithm/parameters used
print clf

### prepare data for training/testing
data = featureFormat(dataset, feature_list)
labels, features = targetFeatureSplit(data)



### stratified k-fold cross-validation is a form of 
### CV where instances of each class are equally apportioned--
### e.g. if you have 10% of one class and 90% of the other,
### stratification means each fold will have 10% of one
### class and 90% of the other
###
### this is helpful when you don't have a lot of instances
### of one class or the other, because in that case the 
### low-frequency class can become lopsided in the training-test
### split skew the results
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.cross_validation import StratifiedKFold
skf = StratifiedKFold( labels, n_folds=3 )
precisions = []
recalls = []
for train_idx, test_idx in skf: 
    features_train = []
    features_test  = []
    labels_train   = []
    labels_test    = []
    for ii in train_idx:
        features_train.append( features[ii] )
        labels_train.append( labels[ii] )
    for jj in test_idx:
        features_test.append( features[jj] )
        labels_test.append( labels[jj] )
    
    ### fit the classifier using training set, and test on test set
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)


    ### for each fold, print some metrics
    print
    print "precision score: ", precision_score( labels_test, pred )
    print "recall score: ", recall_score( labels_test, pred )

    precisions.append( precision_score(labels_test, pred) )
    recalls.append( recall_score(labels_test, pred) )

### aggregate precision and recall over all folds
print "average precision: ", sum(precisions)/3.
print "average recall: ", sum(recalls)/3.








#print precision_score( labels_test, pred )
#print recall_score( labels_test, pred )


None
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-37-aaf8e574bb2a> in <module>()
     60 
     61     ### fit the classifier using training set, and test on test set
---> 62     clf.fit(features_train, labels_train)
     63     pred = clf.predict(features_test)
     64 

AttributeError: 'NoneType' object has no attribute 'fit'

In [38]:
df.head()


Out[38]:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value expenses loan_advances from_messages other from_this_person_to_poi poi director_fees deferred_income long_term_incentive from_poi_to_this_person
ALLEN PHILLIP K 201955 2902 2869717 4484442 1729541 4175000 126027 1407 -126027 1729541 13868 0 2195 152 65 False 0 -3081055 304805 47
BADUM JAMES P 0 0 178980 182466 257817 0 0 0 0 257817 3486 0 0 0 0 False 0 0 0 0
BANNANTINE JAMES M 477 566 0 916197 4046157 0 1757552 465 -560222 5243487 56301 0 29 864523 0 False 0 -5104 0 39
BAXTER JOHN C 267102 0 1295738 5634343 6680544 1200000 3942714 0 0 10623258 11200 0 0 2660303 0 False 0 -1386055 1586055 0
BAY FRANKLIN R 239671 0 260455 827696 0 400000 145796 0 -82782 63014 129142 0 0 69 0 False 0 -201641 0 0

In [39]:
#df[(df.poi == True)].email_address


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-39-74f9d0d84870> in <module>()
----> 1 df[(df.poi == True)].email_address

c:\Anaconda\lib\site-packages\pandas\core\generic.pyc in __getattr__(self, name)
   1841                 return self[name]
   1842             raise AttributeError("'%s' object has no attribute '%s'" %
-> 1843                                  (type(self).__name__, name))
   1844 
   1845     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'email_address'

In [43]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 140 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 20 columns):
salary                       140 non-null float64
to_messages                  140 non-null float64
deferral_payments            140 non-null float64
total_payments               140 non-null float64
exercised_stock_options      140 non-null float64
bonus                        140 non-null float64
restricted_stock             140 non-null float64
shared_receipt_with_poi      140 non-null float64
restricted_stock_deferred    140 non-null float64
total_stock_value            140 non-null float64
expenses                     140 non-null float64
loan_advances                140 non-null float64
from_messages                140 non-null float64
other                        140 non-null float64
from_this_person_to_poi      140 non-null float64
poi                          140 non-null bool
director_fees                140 non-null float64
deferred_income              140 non-null float64
long_term_incentive          140 non-null float64
from_poi_to_this_person      140 non-null float64
dtypes: bool(1), float64(19)

In [46]:
df.describe()


Out[46]:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value expenses loan_advances from_messages other from_this_person_to_poi poi director_fees deferred_income long_term_incentive from_poi_to_this_person
count 140.000000 140.000000 140.000000 1.400000e+02 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140 140.000000 140.000000 140.000000 140.000000
mean 190744.492857 1273.942857 228434.971429 2.320163e+06 2133411.692857 695311.564286 893351.614286 722.685714 75515.557143 2991223.428571 36386.064286 599464.285714 373.971429 303166.835714 25.328571 0.1285714 9413.385714 -199217.078571 346585.200000 39.864286
std 197294.093628 2259.506178 763919.760540 8.965398e+06 4851626.749684 1245353.853038 2039916.501419 1085.974496 1320523.687009 6258245.359442 45551.830946 6890811.098023 1470.070002 1146367.057329 80.807030 0.3359269 30322.956715 613764.992934 694584.937036 75.042263
min 0.000000 0.000000 -102500.000000 0.000000e+00 0.000000 0.000000 -2604490.000000 0.000000 -1787380.000000 -44093.000000 0.000000 0.000000 0.000000 0.000000 0.000000 False 0.000000 -3504386.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 1.092718e+05 0.000000 0.000000 68704.500000 0.000000 0.000000 337227.250000 0.000000 0.000000 0.000000 0.000000 0.000000 0 0.000000 -39072.000000 0.000000 0.000000
50% 211816.000000 430.000000 0.000000 9.842120e+05 644048.000000 312500.000000 364374.000000 118.000000 0.000000 986016.500000 22614.000000 0.000000 18.500000 1038.500000 0.000000 0 0.000000 0.000000 0.000000 7.500000
75% 271801.500000 1689.250000 11840.750000 1.985668e+06 1735597.250000 812500.000000 857103.000000 1042.000000 0.000000 2372703.250000 54234.500000 0.000000 56.750000 150507.500000 14.000000 0 0.000000 0.000000 375304.000000 42.000000
max 1111258.000000 15149.000000 6426990.000000 1.035598e+08 34348384.000000 8000000.000000 14761694.000000 5521.000000 15456290.000000 49110078.000000 228763.000000 81525000.000000 14368.000000 10359729.000000 609.000000 True 137864.000000 0.000000 5145434.000000 528.000000

In [47]:
import matplotlib.pyplot as plt

In [81]:
plt.plot(df.salary.fillna(df.to_messages.median()))
plt.plot(df[df.poi==True].salary)


Out[81]:
[<matplotlib.lines.Line2D at 0x18478828>]

In [78]:
df.apply(lambda x: x.fillna(x.median()), axis=0).describe()


Out[78]:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value expenses loan_advances from_messages other from_this_person_to_poi poi director_fees deferred_income long_term_incentive from_poi_to_this_person
count 140.000000 140.000000 140.000000 1.400000e+02 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140.000000 140 140.000000 140.000000 140.000000 140.000000
mean 275759.392857 1741.042857 389495.521429 2.462458e+06 2521323.821429 1011382.992857 991022.871429 1008.307143 -47716.385714 3126094.750000 51680.242857 2556607.142857 389.785714 321222.285714 28.414286 0.1285714 102047.314286 -299054.821429 572741.271429 53.364286
std 145378.632921 2063.168503 720551.130069 8.935969e+06 4715631.457413 1116399.113561 2004813.796551 945.726877 1328398.493412 6203670.569959 37886.738249 6723421.969061 1466.149274 1141814.707275 79.922533 0.3359269 14144.632209 584705.403418 607619.669145 69.567235
min 477.000000 57.000000 -102500.000000 1.480000e+02 3285.000000 70000.000000 -2604490.000000 2.000000 -1787380.000000 -44093.000000 148.000000 400000.000000 12.000000 2.000000 0.000000 False 3285.000000 -3504386.000000 69223.000000 0.000000
25% 238740.750000 897.000000 221063.500000 5.610545e+05 758993.000000 700000.000000 338764.250000 583.000000 -140264.000000 643802.000000 32965.000000 2000000.000000 33.750000 2426.500000 5.000000 0 103750.000000 -151927.000000 422158.000000 25.000000
50% 258741.000000 1211.000000 221063.500000 1.106740e+06 1324578.000000 750000.000000 441096.000000 740.500000 -140264.000000 1110705.000000 46547.500000 2000000.000000 41.000000 51587.000000 8.000000 0 103750.000000 -151927.000000 422158.000000 35.000000
75% 271801.500000 1689.250000 221063.500000 1.985668e+06 1735597.250000 812500.000000 857103.000000 1042.000000 -140264.000000 2372703.250000 54234.500000 2000000.000000 56.750000 150507.500000 14.000000 0 103750.000000 -151927.000000 422158.000000 42.000000
max 1111258.000000 15149.000000 6426990.000000 1.035598e+08 34348384.000000 8000000.000000 14761694.000000 5521.000000 15456290.000000 49110078.000000 228763.000000 81525000.000000 14368.000000 10359729.000000 609.000000 True 137864.000000 -833.000000 5145434.000000 528.000000

In [74]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 140 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 20 columns):
salary                       94 non-null float64
to_messages                  86 non-null float64
deferral_payments            38 non-null float64
total_payments               122 non-null float64
exercised_stock_options      99 non-null float64
bonus                        81 non-null float64
restricted_stock             109 non-null float64
shared_receipt_with_poi      86 non-null float64
restricted_stock_deferred    17 non-null float64
total_stock_value            123 non-null float64
expenses                     94 non-null float64
loan_advances                3 non-null float64
from_messages                86 non-null float64
other                        91 non-null float64
from_this_person_to_poi      86 non-null float64
poi                          140 non-null bool
director_fees                15 non-null float64
deferred_income              48 non-null float64
long_term_incentive          65 non-null float64
from_poi_to_this_person      86 non-null float64
dtypes: bool(1), float64(19)

In [101]:
plt.plot(df.long_term_incentive, 'ro')
plt.plot(df[df.poi==True].long_term_incentive, 'bo')


Out[101]:
[<matplotlib.lines.Line2D at 0x1b112400>]

In [108]:
df1 = df.drop(['deferral_payments', 'restricted_stock_deferred', 'loan_advances', 'director_fees'], axis=1)

In [117]:
f1 = df1.drop(['poi'], axis=1)
y1 = df['poi']

In [119]:
from sklearn.preprocessing import scale

In [122]:
f1 = f1.apply(lambda x: x.fillna(x.median()), axis=0)
f_scaled = scale(f1)

In [125]:
from sklearn.decomposition import RandomizedPCA

In [128]:
pca = RandomizedPCA(n_components=5, whiten=True).fit(f_scaled)

In [129]:
x_pca = pca.transform(f_scaled)

In [142]:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
param_grid = {
         'C': [.01, .1, 1, 10, 100, 1e3, 5e3, 1e4, 5e4, 1e5],
          'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 1, 10, 100, 1000],
          }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid,n_jobs=4)
clf = clf.fit(x_pca, y1)

In [143]:
print clf.best_estimator_


SVC(C=0.1, cache_size=200, class_weight='auto', coef0=0.0, degree=3,
  gamma=0.0001, kernel='rbf', max_iter=-1, probability=False,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

In [145]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
y_pred = clf.predict(x_pca)
print classification_report(y1, y_pred)
print confusion_matrix(y1, y_pred)


             precision    recall  f1-score   support

      False       0.87      1.00      0.93       122
       True       0.00      0.00      0.00        18

avg / total       0.76      0.87      0.81       140

[[122   0]
 [ 18   0]]

In [ ]: